library(stringi)
library(stringr)
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.4.3
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corpus)
## Warning: package 'corpus' was built under R version 3.4.3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(wordcloud2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.3
# Project Gutenberg ids of the Charles Dickens books under study.
book_index = c(564, 580, 700, 730, 766, 786, 821, 883, 917, 963, 967, 968, 1023, 1400)
dickens = gutenberg_download(book_index)
# Extend the stop-word list with honorifics that dominate Dickens dialogue.
# Bind a named one-row-per-word frame instead of a bare character vector so
# we do not depend on the positional column order of `stop_words`.
stop_words = rbind(stop_words,
                   data.frame(word = c("miss", "sir"),
                              lexicon = c("SMART", "SMART")))
# One row per token across all downloaded books.
all_words = dickens %>% unnest_tokens(word, text)
# Top-20 non-stop words as a column chart (top_n column made explicit to
# silence the "Selecting by n" message).
all_words %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  hchart('column', hcaes(x = word, y = n)) %>%
  hc_add_theme(hc_theme_monokai()) %>%
  hc_yAxis(title = list(text = "Number of Occurrences"))
# Top-200 non-stop words rendered as a word cloud shaped by dickens.png.
all_words %>%
  anti_join(stop_words, by = 'word') %>%
  count(word, sort = TRUE) %>%
  top_n(200, n) %>%
  wordcloud2(size = 0.18, figPath = 'dickens.png')
# Q3: for each book, find words that appear capitalized but never in lower
# case (proper-noun candidates) and plot the 5 most frequent per book.
for (i in book_index) {
  book = gutenberg_download(i)
  # Collapse the text column into a single string. The original passed the
  # whole data frame to str_replace_all, which coerced it to character with
  # a warning and made the later [[2]] index depend on that coercion.
  full_text = paste(book$text, collapse = ' ')
  full_text = str_replace_all(full_text, '\n', '')
  full_text = str_replace_all(full_text, '\r', '')
  full_text = str_replace_all(full_text, '\"', '')
  full_text = str_replace_all(full_text, 'xa0xa0', '')
  full_text = str_replace_all(full_text, '[[:punct:]]', '')
  # Words starting lower-case: if a word also occurs uncapitalized, it is
  # probably not a proper noun. stringsAsFactors = FALSE avoids the
  # factor-coercion warnings the joins produced before.
  smalls = data.frame(word = str_extract_all(full_text, '\\b[a-z]\\w+')[[1]],
                      stringsAsFactors = FALSE)
  capitals = data.frame(word = tolower(str_extract_all(full_text, '\\b[A-Z]\\w+')[[1]]),
                        stringsAsFactors = FALSE)
  df = capitals %>%
    anti_join(smalls, by = 'word') %>%
    anti_join(stop_words, by = 'word') %>%
    count(word, sort = TRUE) %>%
    top_n(5, n)
  # Title with the loop's book id: the original referenced df$gutenberg_id,
  # a column that does not exist ("Unknown or uninitialised column" warning).
  print(ggplot(data = df) +
          geom_bar(aes(x = reorder(word, -n), y = n), stat = 'identity') +
          ggtitle(i))
}
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning: argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
## Joining, by = "word"
## Warning: Column `word` joining factors with different levels, coercing to
## character vector
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Selecting by n
## Warning: Unknown or uninitialised column: 'gutenberg_id'.
برای این که فضا را مورد بررسی قرار دهیم باید کتابها را جداگانه بررسی کنیم. برای مثال اگر کتاب با شماره ۵۶۴ را در نظر بگیریم تعداد کلمات مثبت ظاهر شده بسیار بیشتر و پرتکرار تر هستند. در نتیجه فضای احساسی این داستان مثبت است.
# NRC positive/negative lexicon subset.
posneg = get_sentiments("nrc") %>% filter(sentiment %in% c('positive', 'negative'))
# Per-book bar charts of the 40 most frequent sentiment-bearing words.
# TRUE spelled out (never T), joins and top_n made explicit to avoid the
# "Joining, by" / "Selecting by n" messages.
plots = dickens %>%
  group_by(gutenberg_id) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = 'word') %>%
  inner_join(posneg, by = 'word') %>%
  count(word, sort = TRUE) %>%
  top_n(40, n) %>%
  do(p = ggplot(data = .) +
       geom_bar(aes(x = reorder(word, -n), y = n), stat = 'identity') +
       ggtitle(unique(.$gutenberg_id)) +
       theme(axis.text.x = element_text(angle = 90, hjust = 1)))
plots$p
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
les_miserable = gutenberg_download(c(135))
pos = get_sentiments("nrc") %>% filter(sentiment == 'positive')
neg = get_sentiments("nrc") %>% filter(sentiment == 'negative')
# Split the book into 200 *contiguous* sections. The original used
# split(x, rep(1:200, nrow/200)), which sends row i to group (i mod 200),
# interleaving lines from across the whole book into each "section" and
# also warning when nrow is not a multiple of 200.
n_lines = nrow(les_miserable)
section = ceiling(seq_len(n_lines) * 200 / n_lines)
les_miserable = split(les_miserable, section)
# Count positive and negative NRC words per section.
posi = vector(length = 200)
negi = vector(length = 200)
for (i in seq_len(200)) {
  # Tokenize once per section, then join against each lexicon.
  tokens = les_miserable[[i]] %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = 'word')
  posi[i] = nrow(tokens %>% inner_join(pos, by = 'word'))
  negi[i] = nrow(tokens %>% inner_join(neg, by = 'word'))
}
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
# Sentiment trajectory across the 200 sections of Les Misérables.
highchart() %>%
  hc_xAxis(categories = 1:200) %>%
  hc_add_series(name = "Negative", data = as.integer(negi)) %>%
  hc_add_series(name = "Positive", data = as.integer(posi))
# Bigrams across all Dickens books, split into word1/word2 columns.
all_bigrams = dickens %>%
  unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
  separate(bigram, c('word1', 'word2'), sep = ' ')
# Top-30 bigrams where neither word is a stop word.
all_bigrams %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE) %>%
  top_n(30, n) %>%
  mutate(bigram = paste(word1, word2, sep = " ")) %>%
  hchart("column", hcaes(x = bigram, y = n)) %>%
  hc_add_theme(hc_theme_monokai()) %>%
  hc_yAxis(title = list(text = "Number of Occurrences"))
# Words following "he"/"she" (duplicated he/she condition collapsed into
# one %in% filter; behavior unchanged).
all_bigrams %>%
  filter(word1 %in% c('he', 'she'), !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE) %>%
  top_n(60, n) %>%
  mutate(bigram = paste(word1, word2, sep = ' ')) %>%
  hchart('column', hcaes(x = bigram, y = n)) %>%
  hc_add_theme(hc_theme_monokai()) %>%
  hc_yAxis(title = list(text = 'Number of Occurrences'))
# Tokenize each Dickens book by chapter: number lines by cumulative
# "CHAPTER ..." headings, drop front matter before the first heading,
# unnest into words, and remove stop words.
chapters = dickens %>%
  group_by(gutenberg_id) %>%
  mutate(chapter = cumsum(str_detect(text, regex('^chapter ', ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = 'word')
# Book ids and chapter numbers seen in the corpus.
gutID = unique(chapters$gutenberg_id)
chID = unique(chapters$chapter)
# Preallocated holder for one character vector per (book, chapter) pair.
allChapters = vector("list", 2000)
cnt = 0
# Flatten every (book, chapter) pair into a vector of single characters,
# ready for character n-gram counting. Empty combinations are skipped.
for (i in gutID) {
  for (j in chID) {
    cnt = cnt + 1
    one_chapter = chapters %>% filter(gutenberg_id == i, chapter == j)
    if (nrow(one_chapter) > 0) {
      # Pull the word column as a plain unnamed character vector.
      words_vec = one_chapter %>% ungroup() %>% select(word) %>% unlist() %>% unname()
      # Explode each word into its individual characters.
      allChapters[[cnt]] = words_vec %>% str_split("") %>% unlist()
    }
  }
}
# Drop (book, chapter) slots that stayed empty.
allChapters = allChapters[!sapply(allChapters, is.null)]
OG = allChapters
# Character (1-gram) frequencies per chapter.
for (i in seq_along(OG)) {
  chars = OG[[i]]
  chars = str_replace_all(chars, '\'', '')
  chars = str_replace_all(chars, '_', '')
  chars = str_replace_all(chars, '`', '')
  # Fixed pattern: the original '[:punct:]]' has unbalanced brackets and
  # never stripped punctuation; '[[:punct:]]' is the intended POSIX class.
  chars = str_replace_all(chars, '[[:punct:]]', '')
  chars = str_replace_all(chars, "[0-9]", "")
  oneGram = data.frame(ch = chars, stringsAsFactors = FALSE)
  OG[[i]] = oneGram %>% group_by(ch) %>% summarise(count = n()) %>% ungroup()
  # Characters blanked out by the replacements above are discarded.
  OG[[i]] = OG[[i]][!(OG[[i]]$ch == ''), ]
}
hchart(OG[[1]], 'column', hcaes(x = ch, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '1-grams for sample chapter')
hchart(OG[[800]], 'column', hcaes(x = ch, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '1-grams for another sample chapter')
TG = allChapters
# Character 2-gram frequencies per chapter: pair each character with the
# next one via lead(). NOTE(review): pairs also span adjacent-word
# boundaries, since the characters come from concatenated words.
for (i in seq_along(TG)) {
  chars = TG[[i]]
  chars = str_replace_all(chars, '\'', '')
  chars = str_replace_all(chars, '_', '')
  chars = str_replace_all(chars, '`', '')
  # Fixed pattern: the original '[:punct:]]' was malformed (unbalanced
  # brackets) and never matched; '[[:punct:]]' strips punctuation as intended.
  chars = str_replace_all(chars, '[[:punct:]]', '')
  chars = str_replace_all(chars, "[0-9]", "")
  twoGram = data.frame(ch1 = chars, ch2 = lead(chars), stringsAsFactors = FALSE)
  TG[[i]] = twoGram %>% group_by(ch1, ch2) %>% summarise(count = n()) %>% ungroup()
  # Drop pairs containing a blanked-out character or the trailing NA lead.
  TG[[i]] = TG[[i]][!(TG[[i]]$ch1 == '' | TG[[i]]$ch2 == ''), ]
  TG[[i]] = TG[[i]] %>% mutate(tg = paste(ch1, ch2, sep = ''))
}
hchart(TG[[1]], 'column', hcaes(x = tg, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '2-grams for sample chapter')
hchart(TG[[800]], 'column', hcaes(x = tg, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '2-grams for another sample chapter')
# Victor Hugo books on Project Gutenberg (id 6539 failed to download in the
# recorded run and is simply absent from `hugo`).
hugo_books = c(135, 2523, 2610, 6539, 8775, 10381, 12587, 20580)
hugo = gutenberg_download(hugo_books)
# Same chapter tokenization as for Dickens: number lines by cumulative
# "CHAPTER ..." headings, drop front matter, unnest, remove stop words.
chapters = hugo %>%
  group_by(gutenberg_id) %>%
  mutate(chapter = cumsum(str_detect(text, regex('^chapter ', ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = 'word')
gutID = unique(chapters$gutenberg_id)
chID = unique(chapters$chapter)
# Preallocated holder for one character vector per (book, chapter) pair.
allChapters = vector("list", 2000)
cnt = 0
# Flatten every (book, chapter) pair into a vector of single characters;
# combinations with no rows are skipped.
for (i in gutID) {
  for (j in chID) {
    cnt = cnt + 1
    one_chapter = chapters %>% filter(gutenberg_id == i, chapter == j)
    if (nrow(one_chapter) > 0) {
      # Plain unnamed character vector of the chapter's words.
      words_vec = one_chapter %>% ungroup() %>% select(word) %>% unlist() %>% unname()
      # Explode words into individual characters.
      allChapters[[cnt]] = words_vec %>% str_split("") %>% unlist()
    }
  }
}
# Drop empty (book, chapter) slots.
allChapters = allChapters[!sapply(allChapters, is.null)]
OG = allChapters
# Character (1-gram) frequencies per chapter for the Hugo corpus.
for (i in seq_along(OG)) {
  chars = OG[[i]]
  chars = str_replace_all(chars, '\'', '')
  chars = str_replace_all(chars, '_', '')
  chars = str_replace_all(chars, '`', '')
  # Fixed pattern: the original '[:punct:]]' has unbalanced brackets and
  # never stripped punctuation; '[[:punct:]]' is the intended POSIX class.
  chars = str_replace_all(chars, '[[:punct:]]', '')
  chars = str_replace_all(chars, "[0-9]", "")
  oneGram = data.frame(ch = chars, stringsAsFactors = FALSE)
  OG[[i]] = oneGram %>% group_by(ch) %>% summarise(count = n()) %>% ungroup()
  # Characters blanked out above are discarded.
  OG[[i]] = OG[[i]][!(OG[[i]]$ch == ''), ]
}
hchart(OG[[1]], 'column', hcaes(x = ch, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '1-grams for sample chapter')
hchart(OG[[600]], 'column', hcaes(x = ch, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '1-grams for another sample chapter')
TG = allChapters
# Character 2-gram frequencies per chapter for the Hugo corpus.
# NOTE(review): pairs span adjacent-word boundaries (characters come from
# concatenated words).
for (i in seq_along(TG)) {
  chars = TG[[i]]
  chars = str_replace_all(chars, '\'', '')
  chars = str_replace_all(chars, '_', '')
  chars = str_replace_all(chars, '`', '')
  # Fixed pattern: the original '[:punct:]]' was malformed (unbalanced
  # brackets) and never matched; '[[:punct:]]' strips punctuation as intended.
  chars = str_replace_all(chars, '[[:punct:]]', '')
  chars = str_replace_all(chars, "[0-9]", "")
  twoGram = data.frame(ch1 = chars, ch2 = lead(chars), stringsAsFactors = FALSE)
  TG[[i]] = twoGram %>% group_by(ch1, ch2) %>% summarise(count = n()) %>% ungroup()
  # Drop pairs containing a blanked-out character or the trailing NA lead.
  TG[[i]] = TG[[i]][!(TG[[i]]$ch1 == '' | TG[[i]]$ch2 == ''), ]
  TG[[i]] = TG[[i]] %>% mutate(tg = paste(ch1, ch2, sep = ''))
}
hchart(TG[[1]], 'column', hcaes(x = tg, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '2-grams for sample chapter')
hchart(TG[[600]], 'column', hcaes(x = tg, y = count)) %>% hc_add_theme(hc_theme_monokai()) %>% hc_yAxis(title = list(text = 'Number of Occurrences')) %>% hc_title(text = '2-grams for another sample chapter')